## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0       ✔ purrr   0.3.0  
## ✔ tibble  2.0.1       ✔ dplyr   0.8.0.1
## ✔ tidyr   0.8.3       ✔ stringr 1.4.0  
## ✔ readr   1.3.1       ✔ forcats 0.4.0
## Warning: package 'tibble' was built under R version 3.5.2
## Warning: package 'purrr' was built under R version 3.5.2
## Warning: package 'dplyr' was built under R version 3.5.2
## Warning: package 'stringr' was built under R version 3.5.2
## Warning: package 'forcats' was built under R version 3.5.2
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
variable_labels <- readRDS('variable_level_labels_ALL_f.rds') %>% mutate(
  var_lvl = case_when(!is.na(lvl) ~ paste(var, lvl, sep="_"),
                      TRUE ~ var)
)

variable_label_unique <- variable_labels %>% group_by(var_lvl) %>% summarize(var=first(var), var_lbl=first(var_lbl), lvl_lbl=first(lvl_lbl))

load('./meta_data/meta_country_simple.Rdata')
categories <- read_tsv('./dd.common.final.tsv')
## Parsed with column specification:
## cols(
##   Variable = col_character(),
##   `Variable Description` = col_character(),
##   kc = col_character(),
##   cjp = col_character(),
##   jc = col_character(),
##   ebd = col_character(),
##   echow = col_character(),
##   category = col_character()
## )
meta_country_simple <- merge(meta_country_simple, categories[ , c('Variable', 'category')], by.x='name', by.y='Variable', all.x=T)
meta_country_simple <- (meta_country_simple %>% left_join(variable_label_unique, by= c("name"="var_lvl")))

R-squared within each country

p <- ggplot(meta_country_simple %>% filter(!is.na(beta)), aes(mean_r2, -log10(pvalue), color=factor(I(-log10(pvalue) > 10 ))))
p <- p  + geom_point(alpha=.1) +scale_x_continuous(limits=c(0, .2))+ facet_grid(country~gender)
p <- p + geom_hline(yintercept=10)
p <- p + theme(legend.position='none')
p
## Warning: Removed 130 rows containing missing values (geom_point).

ECDF of R2 for each country

p <- ggplot(meta_country_simple %>% filter(!is.na(beta)), aes(mean_r2,color=factor(I(-log10(pvalue) > 10)) ))
p <- p + stat_ecdf() + scale_x_continuous(limits=c(0, .05)) + facet_grid(country~gender)
p <- p + theme(legend.position='bottom')
p
## Warning: Removed 388 rows containing non-finite values (stat_ecdf).

p <- ggplot(meta_country_simple %>% filter(!is.na(beta), -log10(pvalue) > 10), aes(mean_r2, -log10(pvalue)))
p <- p  + geom_point(alpha=.05) + scale_x_continuous(limits=c(0, .2))+ facet_grid(country~gender)
p
## Warning: Removed 44 rows containing missing values (geom_point).

num_sig_per_country_f <- (meta_country_simple %>% filter(gender == 'f') %>% split(.$name) %>% map(function(x) {
  tibble(country=x$country, gender = 'f', sig=x$pvalue<1e-10, mean_r2=x$mean_r2,beta=x$beta)
}))
num_sig_per_country_m <- (meta_country_simple %>% filter(gender == 'm') %>% split(.$name) %>% map(function(x) {
  tibble(country=x$country, gender = 'm', sig=x$pvalue<1e-10, mean_r2=x$mean_r2,beta=x$beta)
}))



num_f <- num_sig_per_country_f %>% map_df(function(x) tibble(nsig=sum(x$sig), npos=sum((x$beta[x$sig]>0)), nneg=sum((x$beta[x$sig]<0)), mean_r2= mean(x$mean_r2), ncountries=nrow(x))) %>% mutate(var=names(num_sig_per_country_f), pct=nsig/ncountries,gender='f')

num_m <- num_sig_per_country_m %>% map_df(function(x) tibble(nsig=sum(x$sig), npos=sum((x$beta[x$sig]>0)), nneg=sum((x$beta[x$sig]<0)), mean_r2= mean(x$mean_r2), ncountries=nrow(x))) %>% mutate(var=names(num_sig_per_country_m), pct=nsig/ncountries,gender='m')

number_found <- rbind(num_f, num_m)

varDesc <- unique(meta_country_simple [ , c('name', 'var', 'var_lbl', 'lvl_lbl', 'category')])
number_found <- merge(number_found, varDesc)
p <- ggplot(number_found, aes(pct)) 
p <- p + geom_histogram() + facet_wrap(~gender)
p 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).

Output all

number_found <- number_found %>% arrange(desc(ncountries), desc(pct))
number_found
write_csv(number_found , path='./meta_data/number_found_per_variable.csv')
write_csv(number_found %>% filter(npos > 0, nneg > 0),path='./meta_data/number_opposite_sign_per_variable.csv')